In this document, we will analyse RNAseq data (counts at gene level from salmon) using DESeq2. The data comes from 6 populations (of 2 geographic regions) and each population was subjected to benign conditions (constant high water level) and stressful conditions (low water level).
Set working directory and load libraries
#setwd("~/Desktop/rnaseq/")
setwd("~/Documents/students_MSc/clara/rnaseq/")
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse() masks IRanges::collapse()
## ✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count() masks matrixStats::count()
## ✖ dplyr::desc() masks IRanges::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks S4Vectors::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename() masks S4Vectors::rename()
## ✖ lubridate::second() masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice() masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fdrtool)
library(UpSetR)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
Import counts prepared with tximport and clean
txi<-readRDS("./salmon_gene_counts.rds")
Remove non-coding regions and baits
# Remove mtDNA, non-coding and nr baits: build the list of gene IDs to keep.
# A gene is dropped when its ID contains "mt", "nr" or "nc" anywhere in the
# string (regex substring match, not a prefix match) -- NOTE(review): confirm
# real gene IDs can never contain these substrings by accident.
all_genes <- rownames(txi$counts)
whitelist <- all_genes[!str_detect(all_genes, pattern = "mt|nr|nc")]
length(whitelist);head(whitelist) # we are keeping 32531 genes
## [1] 32531
## [1] "PECUL23A000004" "PECUL23A000005" "PECUL23A000006" "PECUL23A000008"
## [5] "PECUL23A000011" "PECUL23A000012"
# Subset every per-gene matrix in the tximport object to the whitelisted
# genes (rows); sample columns are untouched.
for (tbl in c("abundance", "counts", "length")) {
  txi[[tbl]] <- txi[[tbl]][whitelist, ]
}
Load the “column data” for the dds object.
### load design matrix
des.mat<-read_csv("./design_matrix.csv")
## Rows: 48 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): sample_id, treatment, population, region
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Re-order factor levels so populations plot in geographic order (central populations first).
des.mat <- des.mat %>%
mutate(population=factor(population, levels=c("Bui","Can","Tur","Esp","Jab","Lla"))) %>% # re-order factors for easy plotting later
mutate(pop_n = factor(rep(rep(1:3,each=8),2))) %>% # replicate-within-region index (1-3). !!! BE CAREFUL: this assumes rows are ordered 8 samples per population, 3 populations per region -- confirm row order before running
mutate_if(is.character, as.factor) # convert remaining character columns to factors (design variables must be factors)
# Drop QC-flagged samples: substantially lower library size (Jab5H6) and a potential outlier (Bui4H14)
des.mat<-des.mat %>%
filter(!sample_id %in% c("Bui4H14_nonrrna","Jab5H6_nonrrna")) # could also remove: "Tur2H6_nonrrna","Bui1L9_nonrrna"
# Subset the txi matrices to the retained samples (columns).
txi$abundance<-txi$abundance[,as.character(des.mat$sample_id)]
txi$counts<-txi$counts[,as.character(des.mat$sample_id)]
txi$length<-txi$length[,as.character(des.mat$sample_id)]
## Re-order des.mat rows to match the column order of the counts matrix
## (DESeq2 requires colData rows to line up with the count columns).
col_order<-match(colnames(txi$counts),des.mat$sample_id)
des.mat<-des.mat[col_order,]
# sanity check: every entry should print TRUE
des.mat$sample_id==colnames(txi$counts)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE
des.mat
## # A tibble: 46 × 5
## sample_id treatment population region pop_n
## <fct> <fct> <fct> <fct> <fct>
## 1 Bui1H9_nonrrna H Bui central 1
## 2 Bui1L9_nonrrna L Bui central 1
## 3 Bui2H12_nonrrna H Bui central 1
## 4 Bui2L1_nonrrna L Bui central 1
## 5 Bui3H10_nonrrna H Bui central 1
## 6 Bui3L8_nonrrna L Bui central 1
## 7 Bui4L4_nonrrna L Bui central 1
## 8 Can1H4_nonrrna H Can central 2
## 9 Can1L1_nonrrna L Can central 2
## 10 Can2H12_nonrrna H Can central 2
## # ℹ 36 more rows
Make deseq object. Our primary objective is to test the effect of dropping the water level on tadpoles from the southern and central populations. We therefore want to contrast high vs. low water for each of the regions, and so we have to include a region:treatment interaction effect. However, we expect region alone to be an important factor affecting gene expression (almost like a batch effect), and we also want to correct for potential minor differences arising from having sampled different populations within each region. See a similar setup here: https://support.bioconductor.org/p/87324/.
Here, the formula is constructed in such a way that populations are treated as replicates per region. In order for this to work, we have to re-code the populations as 1 to 3 within each region (see chunk above). Otherwise, each population/replicate is unique to each region and will throw back an error.
We drop the intercept (the first term in the formula is zero), because this allows us to more easily compare contrasts. If not, the base-level expression would be that of the intercept, which is more difficult to interpret for such a complex model design. Read more here: https://www.biostars.org/p/395926/
# Build the DESeq2 dataset from the tximport object.
# Design: the intercept is dropped (0 +) so each region gets its own baseline
# coefficient; region:pop_n absorbs replicate-population differences within
# each region; region:treatment gives one Low-vs-High treatment effect per region.
dds<-DESeqDataSetFromTximport(txi,
colData = des.mat,
design = ~ 0+region + region:pop_n + region:treatment)
## using counts and average transcript lengths from tximport
# make sure metadata is in matching order
des.mat$sample_id==colnames(assay(dds))
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE
This is an ambiguous step. It may be worth playing around with how this affects the final outcome.
rowSums(counts(dds)) %>%
enframe() %>%
ggplot(aes(x=value)) +
geom_histogram() +
scale_x_log10()
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8426 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Apply two alternative low-count pre-filters so we can compare them downstream.
dds1<-dds[rowSums(counts(dds) >= 1) >= 12,] # keep genes with at least 1 count in at least 12 samples (e.g. one per treatment per population)
dim(dds1) # we are left with ~18k genes
## [1] 17987 46
dds2<-dds[rowSums(counts(dds) >= 50) >= 12,] # stricter: keep genes with at least 50 counts in at least 12 samples
dim(dds2) # we are left with ~12k genes
## [1] 11657 46
Its always a good idea to plot a PCA of the counts data.
Although for the actual DE expression we use raw count data, for visualization or clustering, it makes more sense to use some sort of transformed count. DESeq authors argue that variance stabilized transformation (VST) or regularized logarithm (rlog) have certain advantages. VST is faster, i.e. good for large datasets, rlog is more often used for small datasets. lets do vst
# VST-transform the counts for visualization/clustering.
# blind = TRUE ignores the design so the transformation is unbiased by group labels.
vst_dds<-vst(dds, blind = TRUE) # no filtering
## using 'avgTxLength' from assays(dds), correcting for library size
vst_dds1<-vst(dds1, blind = TRUE) # mild filtering
## using 'avgTxLength' from assays(dds), correcting for library size
vst_dds2<-vst(dds2, blind = TRUE) # hard filtering
## using 'avgTxLength' from assays(dds), correcting for library size
# PCA on the TRANSPOSED (samples x genes) data, centered but NOT scaled
# (prcomp scale. defaults to FALSE; VST values are already on a common scale)
vst_pca<- prcomp(t(assay(vst_dds)), center = TRUE)
vst_pca1<- prcomp(t(assay(vst_dds1)), center = TRUE)
vst_pca2<- prcomp(t(assay(vst_dds2)), center = TRUE)
To plot multiple axes at once, I am going to use paired plots. The densities on the diagonal are also useful for seeing which PC does the best job at separating out treatments.
vst_pca$x %>%
as_tibble(rownames = "sample_id") %>%
left_join(des.mat) %>% # add the experimental design information
ggpairs(columns = c("PC1", "PC2", "PC3", "PC4","PC5","PC6"), aes(color=region, shape=treatment)) +
ggtitle("VST PCA (no filtering)")
## Joining with `by = join_by(sample_id)`
## Warning in geom_point(): All aesthetics have length 1, but the data has 36 rows.
## ℹ Did you mean to use `annotate()`?
# Paired PC plots for the mildly filtered data, coloured by region.
vst_pca1$x %>%
as_tibble(rownames = "sample_id") %>%
left_join(des.mat) %>% # add the experimental design information
ggpairs(columns = c("PC1", "PC2", "PC3", "PC4","PC5","PC6"), aes(color=region, shape=treatment)) +
ggtitle("VST PCA (mild filtering)")
## Joining with `by = join_by(sample_id)`
## Warning in geom_point(): All aesthetics have length 1, but the data has 36 rows.
## ℹ Did you mean to use `annotate()`?
# Paired PC plots for the strictly filtered data, coloured by region.
vst_pca2$x %>%
as_tibble(rownames = "sample_id") %>%
left_join(des.mat) %>% # add the experimental design information
ggpairs(columns = c("PC1", "PC2", "PC3", "PC4","PC5","PC6"), aes(color=region, shape=treatment)) +
ggtitle("VST PCA (hard filtering)")
## Joining with `by = join_by(sample_id)`
## Warning in geom_point(): All aesthetics have length 1, but the data has 36 rows.
## ℹ Did you mean to use `annotate()`?
#vst_pca2$x[,colnames(vst_pca$x)=="PC5"]<(-40)
# using canned function from DESeq2
#plotPCA(vst_dds, intgroup=c("region"), ntop=Inf) +
# coord_fixed()
Here we can quite clearly see that the strongest effect is that of the region. PC1 separates them out quite neatly.
Filtering doesn’t have a noticeable effect.
vst_pca1$x %>%
as_tibble(rownames = "sample_id") %>%
left_join(des.mat) %>% # add the experimental design information
ggpairs(columns = c("PC1", "PC2", "PC3", "PC4","PC5","PC6"), aes(color=treatment, shape=region)) +
ggtitle("VST PCA")
## Joining with `by = join_by(sample_id)`
## Warning in geom_point(): All aesthetics have length 1, but the data has 36 rows.
## ℹ Did you mean to use `annotate()`?
Plotting the same PCA, but colouring treatments rather than regions, shows that we have to go down to PC3 to start getting treatment separations. PC1 vs PC3 may give us the best separation of both region and treatment.
We can plot specific axes again to highlight some of this more clearly.
# calculate hulls
pca_hull <-
vst_pca1$x %>%
as_tibble(rownames = "sample_id") %>%
left_join(des.mat) %>%
group_by(population, treatment) %>%
dplyr::slice(chull(PC1, PC3))
## Joining with `by = join_by(sample_id)`
# Plot PC1 vs PC3 per population: one point per sample, convex hulls per
# population x treatment (computed in the chunk above), sample labels via ggrepel.
# Define the palette once so colour and fill stay in sync
# (warm colours = central populations, green/blue = southern).
pop_cols <- c("Bui"="#900C3F", "Can"="#FF5733", "Tur"="#FFC300",
              "Esp"="#0FBA3D", "Jab"="#0FBA93", "Lla"="#0F8CBA")
vst_pca1$x %>%
as_tibble(rownames = "sample_id") %>%
left_join(des.mat) %>% # add the experimental design information
ggplot(aes(x=PC1, y=PC3, color=population, shape=treatment)) +
geom_hline(yintercept = 0, linewidth=0.1) +
geom_vline(xintercept = 0, linewidth=0.1) +
geom_point(size=3) +
geom_polygon(data = pca_hull,
aes(fill = population,
colour = population,
linetype=treatment),
alpha = 0.1,
show.legend = FALSE) +
ggrepel::geom_text_repel(aes(label=sample_id),size=2) +
scale_color_manual(values=pop_cols)+
scale_fill_manual(values=pop_cols)+
facet_wrap(~population) +
theme_minimal()
## Joining with `by = join_by(sample_id)`
We can just about see that the warm colours (central populations) fall on the left side of the plotting area and the green colours (southern populations) on the right side, so PC1 is splitting the regions quite well. The y-axis (PC3), on the other hand, fairly consistently splits the triangles from the squares (except perhaps in Tur), so we are also getting a consistent treatment signal.
we can now run the DESeq pipeline, which will automatically run a number of steps including controlling for different library sizes etc, and finally also fitting the generalized linear model.
dds <- DESeq(dds)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dds1 <- DESeq(dds1)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dds2 <- DESeq(dds2)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(dds1) # there are many coefficients (effects), due to the population replicates, but we are not really interested in all of these
## [1] "regioncentral" "regionsouth"
## [3] "regioncentral.pop_n2" "regionsouth.pop_n2"
## [5] "regioncentral.pop_n3" "regionsouth.pop_n3"
## [7] "regioncentral.treatmentL" "regionsouth.treatmentL"
# Treatment effect (Low vs High; High is the reference level for treatment) within the CENTRAL region
res_treat_central<-results(dds1, name="regioncentral.treatmentL")
summary(res_treat_central, alpha=0.05)
##
## out of 17987 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 88, 0.49%
## LFC < 0 (down) : 5, 0.028%
## outliers [1] : 124, 0.69%
## low counts [2] : 3104, 17%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Treatment effect (Low vs High; High is the reference level for treatment) within the SOUTH region
res_treat_south<-results(dds1, name="regionsouth.treatmentL")
summary(res_treat_south, alpha=0.05)
##
## out of 17987 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 443, 2.5%
## LFC < 0 (down) : 540, 3%
## outliers [1] : 124, 0.69%
## low counts [2] : 3445, 19%
## (mean count < 7)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# lets combine them into a list
res_dds<-list("central"=res_treat_central,
"south"=res_treat_south)
Relatively low numbers of DEGs. We should diagnose our models
# P-value histograms per region: under a well-calibrated test these should be
# roughly uniform with an enrichment near zero.
par(mfrow=c(2,1))
for (i in seq_along(res_dds)) { # seq_along is safe for empty lists (1:length is not)
  hist(res_dds[[i]]$pvalue, breaks=40, col="grey", main=names(res_dds)[i])
}
par(mfrow=c(1,1)) # reset plotting layout
They look OK
# Barplot of up-/down-regulated DEG counts (padj < 0.1) per REGION.
# The list elements are regions ("central"/"south"), so the bind_rows id column
# is named "region" (it was previously mislabelled "population").
res_dds %>%
lapply(as_tibble, rownames = "gene_id") %>%
bind_rows(.id = "region") %>%
drop_na(padj) %>% # drop genes with NA padj (independent filtering / outliers)
filter(padj < 0.1) %>%
mutate(updown = ifelse(log2FoldChange > 0, "up", "down")) %>%
group_by(region, updown) %>%
summarise(n = n(), .groups = "drop") %>% # drop grouping explicitly; silences the summarise() message
mutate(n = ifelse(updown == "down", -n, n)) %>% # draw down-regulated counts below zero
ggplot(aes(x = region, y = n, fill = updown)) +
geom_bar(stat = "identity") +
theme_bw() +
theme(legend.position = "none")
## `summarise()` has grouped output by 'population'. You can override using the
## `.groups` argument.
in general, p values should be evenly distributed, with an inflation of p=0. Common “bad” distributions include “U-shaped” distributions or “hill shaped” distributions. The present distribution is not bad, but we can see if it would be better using a local fit instead of a parametric fit for the dispersion.
# Whether a gene is called significant depends not only on its LFC but also on its within-group variability, which DESeq2 quantifies as the dispersion.
## we can inspect the dispersion of a (default) parametric fit vs a local fit
disp.par <- estimateDispersions(dds1, fitType = "parametric")
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
disp.loc <- estimateDispersions(dds1, fitType = "local")
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
par(mfrow=c(2,1))
plotDispEsts(disp.par, main= "dispEst: parametric")
plotDispEsts(disp.loc, main= "dispEst: local")
par(mfrow=c(1,1))
### calculate the MEDIAN absolute residual of the gene-wise dispersion estimates around each fitted trend
median(abs(log(mcols(disp.par)$dispGeneEst) - log(mcols(disp.par)$dispFit)))
## [1] 1.020538
median(abs(log(mcols(disp.loc)$dispGeneEst) - log(mcols(disp.loc)$dispFit)))
## [1] 0.7595821
## the local fit looks a little better (lower median absolute residual, i.e. the gene-wise estimates sit closer to the fitted trend)
### we can calculate the new results using local fit type
dds.loc<-DESeq(dds1, fitType = "local")
## using pre-existing normalization factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(dds.loc)
## [1] "regioncentral" "regionsouth"
## [3] "regioncentral.pop_n2" "regionsouth.pop_n2"
## [5] "regioncentral.pop_n3" "regionsouth.pop_n3"
## [7] "regioncentral.treatmentL" "regionsouth.treatmentL"
# compares Low vs High (high is the reference level for treatment) in the central region
res_loc_treat_central<-results(dds.loc, name="regioncentral.treatmentL")
summary(res_loc_treat_central, alpha=0.05)
##
## out of 17987 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 93, 0.52%
## LFC < 0 (down) : 6, 0.033%
## outliers [1] : 124, 0.69%
## low counts [2] : 2080, 12%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Treatment effect (Low vs High; High is the reference level for treatment) within the SOUTH region
res_loc_treat_south<-results(dds.loc, name="regionsouth.treatmentL")
summary(res_loc_treat_south, alpha=0.05)
##
## out of 17987 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 478, 2.7%
## LFC < 0 (down) : 562, 3.1%
## outliers [1] : 124, 0.69%
## low counts [2] : 3104, 17%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# lets combine them into a list
res_dds_loc<-list("central"=res_loc_treat_central,
"south"=res_loc_treat_south)
# P-value histograms for the local-fit results
par(mfrow=c(2,1))
for (i in seq_along(res_dds_loc)) { # seq_along is safe for empty lists (1:length is not)
  hist(res_dds_loc[[i]]$pvalue, breaks=40, col="grey", main=names(res_dds_loc)[i])
}
par(mfrow=c(1,1)) # reset plotting layout
In general, very similar results, though perhaps a slight improvement
# Barplot of up-/down-regulated DEG counts (padj < 0.1) per REGION, local fit.
# As above, the bind_rows id column holds region names, not populations.
res_dds_loc %>%
lapply(as_tibble, rownames = "gene_id") %>%
bind_rows(.id = "region") %>%
drop_na(padj) %>% # drop genes with NA padj (independent filtering / outliers)
filter(padj < 0.1) %>%
mutate(updown = ifelse(log2FoldChange > 0, "up", "down")) %>%
group_by(region, updown) %>%
summarise(n = n(), .groups = "drop") %>% # drop grouping explicitly; silences the summarise() message
mutate(n = ifelse(updown == "down", -n, n)) %>% # draw down-regulated counts below zero
ggplot(aes(x = region, y = n, fill = updown)) +
geom_bar(stat = "identity") +
theme_bw() +
theme(legend.position = "none")
## `summarise()` has grouped output by 'population'. You can override using the
## `.groups` argument.
From the pvalue distributions, we can say that perhaps our data is not fitting the default negative binomial Wald test assumptions. Under this test, the p-value distribution is expected to be flat with an enrichment in low p-values. As we don’t recover this distribution, we have to conclude that the default DESeq analysis is overestimating the variance in our data.
To alleviate this, we have to instead estimate our own null distribution which we can do with the fdrtool package. You can read more about this here: https://seqqc.wordpress.com/2016/01/27/too-few-or-none-differential-expressed-genes-a-way-to-fix-the-null-hypothesis-in-deseq2/
# Re-estimate the null distribution with fdrtool and recompute BH-adjusted
# p-values. fdrtool cannot handle NA inputs, so NA genes are dropped first.
res_fdr_dds<-res_dds
res_fdrtools<-list()
for (i in seq_along(res_fdr_dds)) { # seq_along is safe for empty lists (1:length is not)
  # 1. remove genes filtered out by DESeq2's independent filtering (padj is NA)
  res_fdr_dds[[i]] <- res_fdr_dds[[i]][!is.na(res_fdr_dds[[i]]$padj), ]
  # 2. remove genes flagged by DESeq2 as outliers (pvalue is NA)
  res_fdr_dds[[i]] <- res_fdr_dds[[i]][!is.na(res_fdr_dds[[i]]$pvalue), ]
  # 3. feed the Wald z-scores to fdrtool to re-estimate the null distribution and p-values
  res_fdrtools[[i]] <- fdrtool(res_fdr_dds[[i]]$stat, statistic = "normal", plot = FALSE)
  # 4. overwrite the old padj with BH-adjusted re-estimated p-values
  res_fdr_dds[[i]]$padj <- p.adjust(res_fdrtools[[i]]$pval, method = "BH")
}
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
##
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
Lets look at these new p values
# Histograms of the fdrtool-re-estimated p-values
par(mfrow=c(2,1))
for (i in seq_along(res_fdrtools)) { # seq_along is safe for empty lists (1:length is not)
  hist(res_fdrtools[[i]]$pval, breaks=40, col="grey", main=names(res_dds)[i])
}
par(mfrow=c(1,1)) # reset plotting layout
These look a little better, though notice the more even dispersion of pvalues (i.e. less relative enrichment of low p values)
## lets see how many DEGs there are now
lapply(res_fdr_dds, summary, alpha=0.05)
##
## out of 14759 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 57, 0.39%
## LFC < 0 (down) : 3, 0.02%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
##
## out of 14418 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 0, 0%
## LFC < 0 (down) : 0, 0%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 7)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
## $central
## NULL
##
## $south
## NULL
We have essentially lost all DEGs for the south, and most for the central region. This seems very strange.
Rather than just the numbers of genes, we also want to see how many shared sets of genes the different populations have in their sets of DEGs.
# Extract the IDs of significant DEGs (padj < 0.05) from a DESeq2 results object.
# Genes with NA padj are dropped implicitly, because filter() keeps only rows
# where the condition evaluates to TRUE.
extract_degs <- function(x) {
  res_tbl <- as_tibble(x, rownames = "gene")
  sig_tbl <- filter(res_tbl, padj < 0.05)
  pull(sig_tbl, gene)
}
# extract all
sig_deg<-lapply(res_dds, FUN=extract_degs)
names(sig_deg)
## [1] "central" "south"
str(sig_deg)
## List of 2
## $ central: chr [1:93] "PECUL23A000127" "PECUL23A000194" "PECUL23A000652" "PECUL23A001362" ...
## $ south : chr [1:983] "PECUL23A000011" "PECUL23A000059" "PECUL23A000253" "PECUL23A000300" ...
# Plot Upset
upset(fromList(sig_deg),
nsets = length(sig_deg),
keep.order = T,
nintersects = 100,
number.angles = 0, point.size = 3, line.size = 1,
sets.x.label = "Number of DEGs",
set_size.show = TRUE,
set_size.scale_max = max(sapply(sig_deg, length))+50, # needed only to expand the axis a bit
text.scale = c(1.2, 1.2, 1.2, 1.2, 1.5, 1.5),
sets.bar.color = c("skyblue","chartreuse3"),
order.by=c("degree","freq"))
Most DEGs are unique to regions, but we get some genes that are overlapping.
We can also plot this as Venn diagrams, seeing as there are only two sets
library(ggVennDiagram)
library(scico)
sig_deg %>%
ggVennDiagram(edge_size = 0) +
scale_fill_scico(palette = "batlow") +
theme(legend.position="none")
# Reaction norms (mean VST per treatment) for DEGs unique to the CENTRAL region.
# setdiff() keeps central DEGs that are not also significant in the south
# (equivalent to the original !central %in% intersect(central, south) filter).
assay(vst_dds1)[setdiff(sig_deg$central, sig_deg$south),] %>%
as_tibble(rownames="gene_id") %>%
pivot_longer(-gene_id, names_to="sample_id", values_to = "vst") %>%
left_join(des.mat) %>% # add the experimental design information
ggplot(aes(x=treatment, y=vst)) +
stat_summary(aes(group = gene_id, color=gene_id), fun = mean, geom = "path") + # fun.y is deprecated since ggplot2 3.3.0
#stat_summary(aes(color = species), fun.data = mean_cl_boot, geom = "errorbar", width = 0.1) +
stat_summary(aes(color = gene_id), fun = mean, geom = "point", size = 2) +
# geom_point() +
facet_wrap(~population) +
scale_y_log10() +
theme(legend.position = "none")
## Joining with `by = join_by(sample_id)`
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Reaction norms (mean VST per treatment) for DEGs unique to the SOUTH region.
# setdiff() keeps south DEGs that are not also significant in the central region.
assay(vst_dds1)[setdiff(sig_deg$south, sig_deg$central),] %>%
as_tibble(rownames="gene_id") %>%
pivot_longer(-gene_id, names_to="sample_id", values_to = "vst") %>%
left_join(des.mat) %>% # add the experimental design information
ggplot(aes(x=treatment, y=vst)) +
stat_summary(aes(group = gene_id, color=gene_id), fun = mean, geom = "path") + # fun.y is deprecated since ggplot2 3.3.0
#stat_summary(aes(color = species), fun.data = mean_cl_boot, geom = "errorbar", width = 0.1) +
stat_summary(aes(color = gene_id), fun = mean, geom = "point", size = 2) +
# geom_point() +
facet_wrap(~population) +
scale_y_log10() +
theme(legend.position = "none")
## Joining with `by = join_by(sample_id)`
The differences in slopes between genes for the corresponding sets are not immediately obvious in my opinion
Lets export the results with the default null distribution and only light pre-filtering
# make a results folder if it does not yet exist
dir.create("results", showWarnings = FALSE)
# save best DESeq2 object
saveRDS(dds1, "./results/deseq2_regions_dds.rds")
# save results object
saveRDS(res_dds,
"./results/deseq2_regions_results.rds")